# Synthesize_and_gapfill_functions.R
#
# Functions for synthesizing or gapfilling timeseries.
#
# author: Anna Ukkola UNSW Jun 2017


#' Gapfills met data
#' @return out
gapfill_with_ERA <- function(datain, era_data, era_vars, tair_units, vpd_units,
                             out_vars, qc_name, qc_flags, varnames, 
                             site_log){
  
  #ERAinterim estimates are provided for TA, SW_in,
  #LW_IN, VPD, PA, P and WS
  #Gapfill observed met data using these estimates
  #And create QC variables if not available to
  #save information about which time steps ERA-gapfilled
  
  #Check that Fluxnet and ERA data dimensions agree
  if(nrow(datain) != nrow(era_data)) {
    error <- paste("Observed flux data and ERAinterim data dimensions",
                   "do not match, aborting [ function:", match.call()[[1]], "]")
    stop_and_log(error, site_log)
    return(site_log)
  }
  
  
  #List available ERA variables
  ind  <- which(!is.na(era_vars))
  
  avail_era  <- era_vars[ind]
  avail_flux <- colnames(datain)[ind]
  avail_out  <- out_vars[ind]
  
  #Initialise list for new QC variables
  #created if no existing QC flags for a variable
  new_qc     <- vector()
  qc_names   <- vector()
  qc_outname <- vector()
  
  #Save method if any tsteps gapfilled
  method <- vector("list", length=length(ind))
  names(method) <- names(ind)
    
  #Loop through available variables
  for(k in 1:length(avail_era)){
    
    #Initialise method
    method[[k]] <- vector()
    
    #Find flux data column index and ERAinterim column index
    #for variable being processed
    flx_col <- which(colnames(datain)==avail_flux[k])
    
    era_col <- which(colnames(era_data)==avail_era[k])
    
    #If gaps in met data variable, gapfill
    if(any(is.na(datain[,flx_col]))){
      
      #Find missing values to fill
      missing <- which(is.na(datain[,flx_col]))
      
      
      ### Relative humidity ###
      #If Flux variable relative humidity, but ERA variable VPD, convert
      if(avail_flux[k] %in% varnames$relhumidity & avail_era[k]=="VPD_ERA"){
        
        era_tair_col <- which(colnames(era_data)=="TA_ERA")
        
        if(length(era_tair_col) == 0){
          error <- paste("Cannot find ERAinterim air temperature data.",
                         "Cannot convert ERA VPD to relative humidity")
          stop_and_log(error, site_log)
          return(site_log)
        }
        
        # Convert ERAinterim VPD to relative humidity
        # Assuming that ERA vpd and tair units the same as observed units
        era_rh <-  VPD2RelHum(VPD=era_data[,era_col], airtemp=era_data[,era_tair_col],
                              vpd_units=vpd_units, tair_units=tair_units, site_log)
        
        #Gapfill
        datain[missing,flx_col] <- era_rh[missing]
        
        
        ### Other variables ###
        #ERAinterim equivalent should exist, use that directly
      } else {
        
        #Gapfill
        datain[missing, flx_col] <- era_data[missing, era_col]
        
      }
      
      #Save method
      method[[k]] <- append(method[[k]], "ERA-Interim")
      
      ## Set QC flags to "4" for time steps filled with ERA data ##
      #Find corresponding qc variable, if available
      qc_col <- which(colnames(datain)==paste(avail_flux[k], qc_name, sep=""))
      
      
      #Replace era gap-filled time steps with ERA QC flag
      if(length(qc_col) > 0){
        datain[missing,qc_col] <- qc_flags$QC_gapfilled["ERA"]
        
      } else {
        message(paste("Could not find QC flag for variable", 
                      avail_flux[k], "gap-filled with ERA data. Creating QC flag."))
        
        #Initialise QC flag with zeros and replace with ERA value where gap-filled
        #(use first value of QC_measured as OzFlux contains several obs qc values)
        qc_var <- rep(qc_flags$QC_measured[1], length(datain[,flx_col]))
        qc_var[missing] <- qc_flags$QC_gapfilled["ERA"]
        
        #Create name for QC variable and save data and name to data.frame
        #Use output variable name to create qc flag name
        qc_names   <- cbind(qc_names, paste(avail_flux[k], qc_name, sep=""))
        new_qc     <- cbind(new_qc, qc_var)
        qc_outname <- cbind(qc_outname, avail_out[k])
        
      }   
    } #if
  } #vars
  
  
  #Assign new QC variable names to data frame column names
  if(length(new_qc) > 0){ 
    colnames(new_qc) <- qc_names
  } 
  new_qc_info <- list(new_qc, qc_outname)
  names(new_qc_info) <- c("data","outname")

  #Collate outputs
  out <- list(datain=datain, new_qc=new_qc_info, method=method)
  
  return(out)
  
} #function


#-----------------------------------------------------------------------------

#' Performs linear interpolation gapfilling
linfill_data <- function(data, tstepsize,
                        linfill=10){
  
  #Max number of consecutive time steps allowed
  #to be missing for linfill
  max_gap <- (linfill*60*60)/tstepsize
  
  #All missing values
  missing <- which(is.na(data))
  
  consec <- seqToIntervals(missing)
  
  #Find consecutive periods shorter/equal to max_gap
  tperiods <- which((consec[,2]-consec[,1]) <= max_gap)
  
  missing_linfilled <- vector()
  
  if(length(tperiods) > 0){
    
    for(k in tperiods){
      
      start <- consec[k,1]
      end   <- consec[k,2]
      
      #If first time step of data series, use
      #first available value
      if(start==1){
        data[start:end] <- data[end+1]
      
      #If last time step of data series,
      #use last available value
      } else if(end==length(data)){
        data[start:end] <- data[start-1]
        
      #Else linearly interpolate
      } else {
        data[start:end] <- (data[start-1] + data[end+1])/2
      }
   
      #Save time steps linearly interpolated
      missing_linfilled <- append(missing_linfilled, c(start:end))
    } 
    
  }
  
  
  #Return gap-filled data and index of missing values
  return(list(data=data, missing=missing_linfilled))  
  
}

#-----------------------------------------------------------------------------

#' Performs copyfill gapfilling
copyfill_data <- function(data, tsteps, tstepsize, copyfill=10,
                          varname, site_log){
  
  
  #Max number of consecutive time steps allowed
  # to be missing
  max_gap <- (copyfill*60*60*24)/tstepsize
  
  
  #First check that no gaps are longer than "copyfill"
    
  #Find missing values
  missing <- which(is.na(data))
  
  #If found missing, gapfill
  if(length(missing) > 0){
    
    consec <- seqToIntervals(missing)
    consec <- matrix(consec, ncol=2) #convert to matrix
    
    #One or several gaps too large, warn
    if(any(consec[,2] - consec[,1] + 1 > max_gap)){
    
      warn <- paste("Data gap too long in variable ", varname,
                    " to be fully gapfilled. Largest gap is ",
                    max(consec[,2] - consec[,1] + 1) / (60 * 60 * 20) * tstepsize,
                    " days, but maximum consecutive gap is set to ", copyfill, " days.",
                    "Amend parameter 'conv_opts$copyfill' to change this.", sep="")
      site_log <- warn_and_log(warn=warn, site_log=site_log)
    }
    
    #Only use missing indices for time periods shorter than copyfill
    rm_ind <- which((consec[,2] - consec[,1] + 1) > max_gap)
    if(length(rm_ind) > 0) { 
      consec <- consec[-rm_ind,] 
    }
    
    #convert to matrix
    consec <- matrix(consec, ncol=2) 
    
    #Create sequences
    seq    <- apply(consec, MARGIN=1, function(x) seq(from=x[1], to=x[2]))
    
    #Save time periods shorter than copyfill for use in gapfilling
    missing_all <- unlist(seq)

      
    #If found gaps < copyfill, gapfill
    if(length(missing_all) > 0){
      #Loop through missing values
      for(k in missing_all){
        
        #Find same time step for other years
        eqv_tsteps <- which(tsteps==tsteps[k])
        
        #Find data for the same time steps, set missing to NA
        fill_data <- data[eqv_tsteps]
        
        #Calculate average
        fill_value <- mean(fill_data, na.rm=TRUE)
        
        #Replace missing value with this
        data[k] <- fill_value   
      } 
    }
  }
    
  
  #Return gap-filled data and index of missing values
  return(list(data=data, missing=missing, site_log=site_log))
  
}


#-----------------------------------------------------------------------------
#' Synthesizes incoming longwave radiation and air pressure
gapfill_LWdown_Pair <- function(data, var, var_ind, TairK=NA, RH=NA, 
                               technique=NA, elev=NA, varnames, site_log){
  
  #Get data to gapfill and Tair
  data_to_fill <- data$data[,names(var_ind)]
  
  tair <- data$data[,names(TairK)]

  #Get Tair units 
  tair_units <- data$units$original_units[names(TairK)]
  
  #Convert Tair if necessary (need Kelvin)
  if(tair_units == "C"){
    tair       <- celsius_to_kelvin(tair)
    tair_units <- "K"
  } else if (tair_units != "K"){
    error <- paste("Cannot recognise air temperature units for",
                   "synthesizing LWdown and/or air pressure,", 
                   "use Celsius (C) or Kelvin (K).")
    stop_and_log(error, site_log)
  }
  
  
  ### LWdown ###
  if (var=="LWdown") {
    
    #Extract rel humidity data    
    rh   <- data$data[,names(RH)]
    
    #First check that have relative humidity in %, not VPD
    if (names(RH) %in% varnames$vpd) {
      vpd_units  <- data$units$original_units[names(RH)]
      rh         <- VPD2RelHum(VPD=rh, airtemp=tair, vpd_units, tair_units, site_log)
    }
    
    #Find missing indices
    missing <- which(is.na(data_to_fill))
    
    if (length(missing) > 0) {
    
      #Synthesize
      for (i in missing) {
        data_to_fill[i] <- SynthesizeLWdown(tair[i], rh[i], technique)  
      }

    }
      
      
  ### Air pressure ### 
  } else if (var=="Pair") {
    
    #Find missing indices
    missing <- which(is.na(data_to_fill))
    
    if (length(missing) > 0) {
      
      #Synthesize
      for (i in missing) {
        data_to_fill[i] <- SynthesizePSurf(tair[i], elev, data$units$original_units[names(data$units$original_units) %in% 
                                                                                      varnames$airpressure])
      }
    }
  }
  
  #Return gap-filled data and index of missing values
  outs <- list(data=data_to_fill, missing=missing)
  
  return(outs)

}


#-----------------------------------------------------------------------------

#' Gapfills flux data using linear regression against met variables
regfill_flux <- function(ydata, traindata, tstepsize, regfill, varname, 
                         swdown_ind, tair_ind, rh_ind,
                         site_log, ...) {
  

  #Max number of consecutive time steps allowed
  # to be missing
  max_gap <- (regfill*60*60*24)/tstepsize
  
  missing_all <- vector()
  
  #First check that no gaps are longer than "regfill"
    
  #Find missing values
  missing <- which(is.na(ydata))
  
  if (length(missing) > 0) {
    
    consec <- seqToIntervals(missing)
    
    #One or several gaps too large, return warning
    if (any(consec[,2] - consec[,1] + 1 > max_gap)) {
      
      warn <- paste("Data gap too long in variable ", varname,
                    " to be fully gapfilled. Largest gap is ",
                    max(consec[,2] - consec[,1] + 1) / (60 * 60 * 20) * tstepsize,
                    " days, but maximum consecutive gap is set to ", regfill, " days.",
                    " Amend parameter 'conv_opts$regfill' to change this.", sep="")
      site_log <- warn_and_log(warn, site_log)
    }
    
    
    #Only use missing indices for time periods shorter than regfill
    rm_ind <- which((consec[,2] - consec[,1] + 1) > max_gap)
    if (length(rm_ind) > 0) { 
      consec <- consec[-rm_ind,] 
    }
    
    #convert to matrix
    consec <- matrix(consec, ncol=2) 
    
    #Create sequences
    seq    <- apply(consec, MARGIN=1, function(x) seq(from=x[1], to=x[2]))
    
    #Append to missing
    missing_all <- append(missing_all, unlist(seq))    
    
  }
  
    
  
  #If found missing values:
  if (length(missing_all) > 0) {  
    
    #SWdown, Tair and humidity available
    if (all(is.finite(c(swdown_ind, tair_ind, rh_ind)))) {
      
      #Collate training data
      train_data <- as.matrix(cbind(traindata[swdown_ind], traindata[tair_ind],
                                    traindata[rh_ind]))
      colnames(train_data) <- c("SWdown", "Tair", "RH")
      
      #Only SWdown available
    } else if (is.finite(swdown_ind)){
      
      #Collate training data
      train_data <- as.matrix(traindata[swdown_ind])
      colnames(train_data) <- c("SWdown")
      
      #None available, return
    } else {
      
      #Log warning to site log that no met variables were available to gapfill fluxes
      warn <- paste("Cannot perform regression gapfilling of flux variables",
                    "no SWdown or PAR available")
      
      site_log <- log_warning(warn, site_log)
      outs     <- list(data=ydata, site_log=site_log, method=NA, missing=c())
      return(outs)
    }
        

    #Obtain regression parameters, separately for day and night
    reg_params <- regtrain(train_data, ydata, ...)
    
    #Predict y using regression
    predicted_y <- regpredict(reg_params$rgrp, reg_params$traindata,
                              reg_params$dayn)

    #Replace missing values with predicted values
    ydata[missing_all] <- predicted_y[missing_all]
   
    #Save method
    method <- colnames(train_data)
  } else {
    method <- ""
  }  
  
  #Collate outputs
  outs <- list(data=ydata, site_log=site_log, method=method, missing=missing)
  
  return(outs)
  
}

#-----------------------------------------------------------------------------

#' Trains multiple linear regression for flux gap-filling 
#' separately for day and night
regtrain <- function(traindata, ydata, ...) {
    
  # Separate day and night:
  dayn <- DayNight(as.double(traindata[,"SWdown"]), ...)
  
  #Day and night training datasets
  Yday     <- ydata[dayn]
  trainDay <- traindata[dayn,]
  
  Ynight     <- ydata[!dayn]
  trainNight <- traindata[!dayn,]
  
  #Collate to dataframes
  data_day           <- cbind(Yday, trainDay)
  colnames(data_day) <- c("y", colnames(traindata))
  
  data_night           <- cbind(Ynight, trainNight)
  colnames(data_night) <- c("y", colnames(traindata))
  
  
  # Train regression parameters (dot means use all variables
  # in data frame as predictors):
  rgrp_day   <- lm(y ~ ., data=as.data.frame(data_day), na.action=na.omit)
  rgrp_night <- lm(y ~ ., data=as.data.frame(data_night), na.action=na.omit)
  
  #Return as list
  rgrp       <- list(day = rgrp_day, night = rgrp_night)
  
  return(list(rgrp=rgrp, traindata=traindata, dayn=dayn))
}

#-----------------------------------------------------------------------------

#' Predict flux values using linear regression parameters
regpredict <- function(rgrp,traindata, dayn) {
  
  # Use existing parameters to make empirical prediction:
  daycoefs   <- coef(rgrp$day)
  nightcoefs <- coef(rgrp$night)
  
  #Calculate each x * coef
  x_vals_day   <- traindata
  x_vals_night <- traindata
  
  for (k in 1:ncol(traindata)) {
    x_vals_day[,k]   <- x_vals_day[,k] * daycoefs[1+k]
    x_vals_night[,k] <- x_vals_night[,k] * nightcoefs[1+k]
  }
  
  #Predict y
  day_y   <- rowSums(x_vals_day) + daycoefs[1]
  night_y <- rowSums(x_vals_night) + nightcoefs[1]
  
  #Combine and mask day/night
  predicted_y <- dayn
  predicted_y[which(dayn)]  <- day_y[which(dayn)]
  predicted_y[which(!dayn)] <- night_y[which(!dayn)]
  
  return(predicted_y)
}

#-----------------------------------------------------------------------------

#' Synthesises downward longwave radiation based on Tair and rel humidity
SynthesizeLWdown <- function(TairK,RH,technique) {
  
  #Three techniques available, see Abramowitz et al. (2012),
  #Geophysical Research Letters, 39, L04808 for details
  
  zeroC <- 273.15
  
  #Inputs missing, set lwdown missing
  if (is.na(TairK) | is.na(RH)) {
    lwdown <- NA

  #Else synthesise value
  } else {
      
    if (technique=='Swinbank_1963') {
      # Synthesise LW down from air temperature only:
      lwdown <- 0.0000094*0.0000000567*TairK^6
      
    } else if(technique=='Brutsaert_1975') {
      satvapres <- 611.2*exp(17.67*((TairK-zeroC)/(TairK-29.65)))
      vapres    <- pmax(5,RH)/100*satvapres
      emiss     <- 0.642*(vapres/TairK)^(1/7)
      lwdown    <- emiss*0.0000000567*TairK^4
      
    } else if(technique=='Abramowitz_2012') {
      satvapres <- 611.2*exp(17.67*((TairK-zeroC)/(TairK-29.65)))
      vapres    <- pmax(5,RH)/100*satvapres
      lwdown    <- 2.648*TairK + 0.0346*vapres - 474
      
    } else {
      CheckError('S4: Unknown requested LWdown synthesis technique.')
    }
  }
  

  return(lwdown)
}

#-----------------------------------------------------------------------------

#' Synthesises air pressure based on Tair and elevation
SynthesizePSurf <- function(TairK, elevation, pair_units) {
  # Synthesizes PSurf based on temperature and elevation
  
  #If Tair missing, set Pair missing
  if (is.na(TairK)) {
    PSurf <- NA
    
  #Else synthesise (in Pa)
  } else {
    PSurf <- 101325 * (TairK / (TairK + 0.0065*elevation))^(9.80665/287.04/0.0065)
  }
  
  #Convert to kPa if necessary
  if(pair_units=="kPa"){
    PSurf <- PSurf/1000
  } else if (pair_units != "Pa"){
    stop(paste("Cannot synthesise air pressure, do not recognise air pressure units.",
               "Please use Pa or kPa"))
  }
  
  
  return(PSurf)
}



